# Print the R version, platform, and attached/loaded packages used for this run
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_4.0.3 magrittr_2.0.1 tools_4.0.3 htmltools_0.5.1
## [5] yaml_2.2.1 stringi_1.5.3 rmarkdown_2.6 knitr_1.30
## [9] stringr_1.4.0 xfun_0.20 digest_0.6.27 rlang_0.4.10
## [13] evaluate_0.14
Load tidyverse and other packages for this lecture:
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.5 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("rvest")
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library("quantmod")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
There is a wealth of data on the internet. How can we scrape and analyze it?
rvest is an R package written by Hadley Wickham which makes web scraping easy.
We follow the instructions in a blog post by Saurav Kaushik to find the most popular feature films of 2019.
Install the SelectorGadget extension for Chrome.
The 100 most popular feature films released in 2019 can be accessed at page https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100.
# Address of the IMDb advanced-search page: feature films released in 2019,
# 100 results per page
url <- "https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100"
# Download and parse the HTML; the parsed document is queried by the
# CSS selectors in the rest of this file
webpage <- url %>% read_html()
webpage
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
Suppose we want to scrape the following 11 features from this page:
Use the CSS selector to get the rankings
# Select every node with the .text-primary class (the rank labels) and show them
rank_data_html <- webpage %>% html_nodes('.text-primary')
rank_data_html
## {xml_nodeset (100)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="lister-item-index unbold text-primary">2.</span>
## [3] <span class="lister-item-index unbold text-primary">3.</span>
## [4] <span class="lister-item-index unbold text-primary">4.</span>
## [5] <span class="lister-item-index unbold text-primary">5.</span>
## [6] <span class="lister-item-index unbold text-primary">6.</span>
## [7] <span class="lister-item-index unbold text-primary">7.</span>
## [8] <span class="lister-item-index unbold text-primary">8.</span>
## [9] <span class="lister-item-index unbold text-primary">9.</span>
## [10] <span class="lister-item-index unbold text-primary">10.</span>
## [11] <span class="lister-item-index unbold text-primary">11.</span>
## [12] <span class="lister-item-index unbold text-primary">12.</span>
## [13] <span class="lister-item-index unbold text-primary">13.</span>
## [14] <span class="lister-item-index unbold text-primary">14.</span>
## [15] <span class="lister-item-index unbold text-primary">15.</span>
## [16] <span class="lister-item-index unbold text-primary">16.</span>
## [17] <span class="lister-item-index unbold text-primary">17.</span>
## [18] <span class="lister-item-index unbold text-primary">18.</span>
## [19] <span class="lister-item-index unbold text-primary">19.</span>
## [20] <span class="lister-item-index unbold text-primary">20.</span>
## ...
# Pull the visible text out of each rank node
rank_data <- rank_data_html %>% html_text()
rank_data
## [1] "1." "2." "3." "4." "5." "6." "7." "8." "9." "10."
## [11] "11." "12." "13." "14." "15." "16." "17." "18." "19." "20."
## [21] "21." "22." "23." "24." "25." "26." "27." "28." "29." "30."
## [31] "31." "32." "33." "34." "35." "36." "37." "38." "39." "40."
## [41] "41." "42." "43." "44." "45." "46." "47." "48." "49." "50."
## [51] "51." "52." "53." "54." "55." "56." "57." "58." "59." "60."
## [61] "61." "62." "63." "64." "65." "66." "67." "68." "69." "70."
## [71] "71." "72." "73." "74." "75." "76." "77." "78." "79." "80."
## [81] "81." "82." "83." "84." "85." "86." "87." "88." "89." "90."
## [91] "91." "92." "93." "94." "95." "96." "97." "98." "99." "100."
# Parse the rank labels ("1.", "2.", ...) as integers
rank_data <- rank_data %>% as.integer()
rank_data
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100
Use SelectorGadget to find the CSS selector .lister-item-header a.
# Select the title links inside each list item's header and show them
title_data_html <- webpage %>% html_nodes('.lister-item-header a')
title_data_html
## {xml_nodeset (100)}
## [1] <a href="/title/tt3741700/?ref_=adv_li_tt">Godzilla: King of the Monster ...
## [2] <a href="/title/tt9608818/?ref_=adv_li_tt">Our Friend</a>
## [3] <a href="/title/tt4154796/?ref_=adv_li_tt">Avengers: Endgame</a>
## [4] <a href="/title/tt4154664/?ref_=adv_li_tt">Captain Marvel</a>
## [5] <a href="/title/tt8367814/?ref_=adv_li_tt">The Gentlemen</a>
## [6] <a href="/title/tt6751668/?ref_=adv_li_tt">Parasite</a>
## [7] <a href="/title/tt5363618/?ref_=adv_li_tt">Sound of Metal</a>
## [8] <a href="/title/tt7131622/?ref_=adv_li_tt">Once Upon a Time... In Hollyw ...
## [9] <a href="/title/tt9016974/?ref_=adv_li_tt">Synchronic</a>
## [10] <a href="/title/tt8946378/?ref_=adv_li_tt">Knives Out</a>
## [11] <a href="/title/tt8579674/?ref_=adv_li_tt">1917</a>
## [12] <a href="/title/tt6394270/?ref_=adv_li_tt">Bombshell</a>
## [13] <a href="/title/tt7286456/?ref_=adv_li_tt">Joker</a>
## [14] <a href="/title/tt2527338/?ref_=adv_li_tt">Star Wars: The Rise Of Skywal ...
## [15] <a href="/title/tt8772262/?ref_=adv_li_tt">Midsommar</a>
## [16] <a href="/title/tt2584384/?ref_=adv_li_tt">Jojo Rabbit</a>
## [17] <a href="/title/tt3281548/?ref_=adv_li_tt">Little Women</a>
## [18] <a href="/title/tt1950186/?ref_=adv_li_tt">Ford v Ferrari</a>
## [19] <a href="/title/tt10195452/?ref_=adv_li_tt">American Skin</a>
## [20] <a href="/title/tt1302006/?ref_=adv_li_tt">The Irishman</a>
## ...
# Keep only the link text, i.e. the movie titles
title_data <- title_data_html %>% html_text()
title_data
## [1] "Godzilla: King of the Monsters"
## [2] "Our Friend"
## [3] "Avengers: Endgame"
## [4] "Captain Marvel"
## [5] "The Gentlemen"
## [6] "Parasite"
## [7] "Sound of Metal"
## [8] "Once Upon a Time... In Hollywood"
## [9] "Synchronic"
## [10] "Knives Out"
## [11] "1917"
## [12] "Bombshell"
## [13] "Joker"
## [14] "Star Wars: The Rise Of Skywalker"
## [15] "Midsommar"
## [16] "Jojo Rabbit"
## [17] "Little Women"
## [18] "Ford v Ferrari"
## [19] "American Skin"
## [20] "The Irishman"
## [21] "Serenity"
## [22] "The Lighthouse"
## [23] "After"
## [24] "Doctor Sleep"
## [25] "Rocketman"
## [26] "Uncut Gems"
## [27] "Jumanji: The Next Level"
## [28] "The Peanut Butter Falcon"
## [29] "Spider-Man: Far from Home"
## [30] "Richard Jewell"
## [31] "The Lion King"
## [32] "Dark Waters"
## [33] "Terminator: Dark Fate"
## [34] "Glass"
## [35] "Don't Let Go"
## [36] "The King"
## [37] "The Outpost"
## [38] "The Platform"
## [39] "6 Underground"
## [40] "Alita: Battle Angel"
## [41] "Cats"
## [42] "Booksmart"
## [43] "Aladdin"
## [44] "Fast & Furious Presents: Hobbs & Shaw"
## [45] "Us"
## [46] "Portrait of a Lady on Fire"
## [47] "John Wick: Chapter 3 - Parabellum"
## [48] "Escape Room"
## [49] "Toy Story 4"
## [50] "I See You"
## [51] "Shazam!"
## [52] "Midway"
## [53] "Seberg"
## [54] "The Professor and the Madman"
## [55] "Hustlers"
## [56] "Ad Astra"
## [57] "It Chapter Two"
## [58] "Motherless Brooklyn"
## [59] "Ready or Not"
## [60] "Downton Abbey"
## [61] "X-Men: Dark Phoenix"
## [62] "Charlie's Angels"
## [63] "Marriage Story"
## [64] "Ma"
## [65] "The Assistant"
## [66] "Saint Maud"
## [67] "Extremely Wicked, Shockingly Evil and Vile"
## [68] "Gemini Man"
## [69] "The Informer"
## [70] "Babyteeth"
## [71] "The Lodge"
## [72] "Official Secrets"
## [73] "Frozen II"
## [74] "El Camino: A Breaking Bad Movie"
## [75] "The Vast of Night"
## [76] "The Good Liar"
## [77] "Just Mercy"
## [78] "First Cow"
## [79] "Polar"
## [80] "Yesterday"
## [81] "Vivarium"
## [82] "Color Out of Space"
## [83] "A Beautiful Day in the Neighborhood"
## [84] "The Highwaymen"
## [85] "The Dead Don't Die"
## [86] "Skyfire"
## [87] "Angel Has Fallen"
## [88] "Zombieland: Double Tap"
## [89] "21 Bridges"
## [90] "Pinocchio"
## [91] "Triple Frontier"
## [92] "Five Feet Apart"
## [93] "The Personal History of David Copperfield"
## [94] "Crawl"
## [95] "Bad Education"
## [96] "Honey Boy"
## [97] "Anna"
## [98] "Fighting with My Family"
## [99] "Maleficent: Mistress of Evil"
## [100] "Dreamland"
# Using CSS selectors to scrape the description section
# '.ratings-bar+ .text-muted' matches the .text-muted element that
# immediately follows each ratings bar
description_data_html <- webpage %>% html_nodes('.ratings-bar+ .text-muted')
description_data_html
## {xml_nodeset (100)}
## [1] <p class="text-muted">\n The crypto-zoological agency Monarch faces o ...
## [2] <p class="text-muted">\n After receiving life-altering news, a couple ...
## [3] <p class="text-muted">\n After the devastating events of <a href="/ti ...
## [4] <p class="text-muted">\n Carol Danvers becomes one of the universe's ...
## [5] <p class="text-muted">\n An American expat tries to sell off his high ...
## [6] <p class="text-muted">\n Greed and class discrimination threaten the ...
## [7] <p class="text-muted">\n A heavy-metal drummer's life is thrown into ...
## [8] <p class="text-muted">\n A faded television actor and his stunt doubl ...
## [9] <p class="text-muted">\n Two New Orleans paramedics' lives are ripped ...
## [10] <p class="text-muted">\n A detective investigates the death of a patr ...
## [11] <p class="text-muted">\n April 6th, 1917. As a regiment assembles to ...
## [12] <p class="text-muted">\n A group of women take on Fox News head <a hr ...
## [13] <p class="text-muted">\n In Gotham City, mentally troubled comedian A ...
## [14] <p class="text-muted">\n The surviving members of the resistance face ...
## [15] <p class="text-muted">\n A couple travels to Northern Europe to visit ...
## [16] <p class="text-muted">\n A young boy in Hitler's army finds out his m ...
## [17] <p class="text-muted">\n Jo March reflects back and forth on her life ...
## [18] <p class="text-muted">\n American car designer <a href="/name/nm07909 ...
## [19] <p class="text-muted">\n A Marine veteran working as a school janitor ...
## [20] <p class="text-muted">\n An old man recalls his time painting houses ...
## ...
# Extract the plain text of every description node
description_data <- description_data_html %>% html_text()
# Preview the first six descriptions
description_data %>% head()
## [1] "\n The crypto-zoological agency Monarch faces off against a battery of god-sized monsters, including the mighty Godzilla, who collides with Mothra, Rodan, and his ultimate nemesis, the three-headed King Ghidorah."
## [2] "\n After receiving life-altering news, a couple finds unexpected support from their best friend, who puts his own life on hold and moves into their family home, bringing an impact much greater and more profound than anyone could have imagined"
## [3] "\n After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."
## [4] "\n Carol Danvers becomes one of the universe's most powerful heroes when Earth is caught in the middle of a galactic war between two alien races."
## [5] "\n An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
## [6] "\n Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."
# Remove the leading newline and the indentation that follows it
description_data <- description_data %>% str_replace("^\\n\\s+", "")
description_data %>% head()
## [1] "The crypto-zoological agency Monarch faces off against a battery of god-sized monsters, including the mighty Godzilla, who collides with Mothra, Rodan, and his ultimate nemesis, the three-headed King Ghidorah."
## [2] "After receiving life-altering news, a couple finds unexpected support from their best friend, who puts his own life on hold and moves into their family home, bringing an impact much greater and more profound than anyone could have imagined"
## [3] "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."
## [4] "Carol Danvers becomes one of the universe's most powerful heroes when Earth is caught in the middle of a galactic war between two alien races."
## [5] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
## [6] "Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."
# Using CSS selectors to scrape the movie runtime section
# Read each runtime label, drop the " min" unit, and store whole minutes
runtime_data <- as.integer(
  str_replace(html_text(html_nodes(webpage, '.runtime')), " min", "")
)
runtime_data
## [1] 132 124 181 123 113 132 120 161 102 130 119 109 122 141 148 108 135 152
## [19] 89 209 106 109 105 152 121 135 123 97 129 131 118 126 128 129 103 140
## [37] 123 94 128 122 110 102 128 137 116 122 130 99 100 98 132 138 102 124
## [55] 110 123 169 144 95 122 113 118 137 99 87 84 110 117 113 118 108 112
## [73] 103 122 91 109 137 122 118 116 97 111 109 132 104 97 121 99 99 125
## [91] 125 116 119 87 108 94 118 108 119 98
Collect the (first) genre of each movie:
# Select the genre element of every movie
genre_data_html <- webpage %>% html_nodes('.genre')
# Extract the raw text (a comma-separated genre list per movie)
genre_data <- genre_data_html %>% html_text()
# Preview the first six entries
genre_data %>% head()
## [1] "\nAction, Adventure, Fantasy "
## [2] "\nDrama "
## [3] "\nAction, Adventure, Drama "
## [4] "\nAction, Adventure, Sci-Fi "
## [5] "\nAction, Comedy, Crime "
## [6] "\nComedy, Drama, Thriller "
# Data-Preprocessing: keep only the first genre of each movie.
# Take everything up to the first comma or whitespace so that hyphenated
# genres such as "Sci-Fi" stay whole ("[:alpha:]+" would cut them at the
# hyphen and return "Sci").
genre_data <- str_extract(genre_data, "[^,\\s]+")
# Converting each genre from text to factor (left disabled)
#genre_data <- as.factor(genre_data)
# Let's have another look at the genre data
head(genre_data)
## [1] "Action" "Drama" "Action" "Action" "Action" "Comedy"
# Using CSS selectors to scrape the IMDb rating section
# Select the <strong> element inside each IMDb rating widget
rating_data_html <- webpage %>% html_nodes('.ratings-imdb-rating strong')
# Extract the rating text
rating_data <- rating_data_html %>% html_text()
# Preview the first six ratings
rating_data %>% head()
## [1] "6.0" "7.2" "8.4" "6.9" "7.8" "8.6"
# Parse the rating strings as numbers
rating_data <- rating_data %>% as.numeric()
# Show the full vector of numeric ratings
print(rating_data)
## [1] 6.0 7.2 8.4 6.9 7.8 8.6 7.8 7.6 6.2 7.9 8.3 6.8 8.5 6.6 7.1 7.9 7.8 8.1
## [19] 6.0 7.9 5.4 7.5 5.3 7.3 7.3 7.4 6.7 7.6 7.5 7.5 6.9 7.6 6.2 6.7 6.3 7.2
## [37] 6.8 7.0 6.1 7.3 2.8 7.2 6.9 6.4 6.8 8.1 7.4 6.4 7.8 6.8 7.0 6.7 5.7 7.3
## [55] 6.3 6.5 6.5 6.8 6.8 7.4 5.7 4.8 7.9 5.6 6.2 7.0 6.6 5.7 6.6 7.2 6.0 7.3
## [73] 6.9 7.3 6.7 6.6 7.6 7.1 6.3 6.8 5.8 6.2 7.3 6.9 5.5 5.3 6.4 6.7 6.6 6.2
## [91] 6.4 7.2 6.4 6.1 7.1 7.3 6.6 7.1 6.6 5.8
# Using CSS selectors to scrape the votes section
# Select the second child span of each votes bar
votes_data_html <- webpage %>%
  html_nodes('.sort-num_votes-visible span:nth-child(2)')
# Extract the vote counts as text (still containing thousands separators)
votes_data <- votes_data_html %>% html_text()
# Preview the first six entries
votes_data %>% head()
## [1] "134,259" "1,437" "810,893" "442,500" "237,935" "553,703"
# Data-Preprocessing: drop every thousands separator. str_replace() removes
# only the first comma, so a count such as "1,234,567" would stay
# non-numeric and turn into NA in the conversion below.
votes_data <- str_replace_all(votes_data, ",", "")
# Data-Preprocessing: converting votes to numerical
votes_data <- as.numeric(votes_data)
# Let's have another look at the votes data
votes_data
## [1] 134259 1437 810893 442500 237935 553703 27602 551932 7253 454854
## [11] 426526 89623 940168 374453 201112 298298 143417 291635 1867 324980
## [21] 35928 143042 38219 143698 137492 220796 188125 66449 324470 62479
## [31] 212343 60542 149742 208389 6458 89147 19241 164642 139547 228421
## [41] 42303 92950 227804 167173 227434 63258 275709 87394 203359 29972
## [51] 255823 66473 5793 35530 83873 199164 207878 43507 110013 39813
## [61] 155646 57265 246835 39868 14488 3273 79188 86351 20862 9317
## [71] 32133 33797 136991 183797 27700 26412 46813 8686 75564 113311
## [81] 38582 32875 61222 75275 53157 868 79163 139759 50029 6817
## [91] 108048 43694 12874 68014 31549 28073 61334 66632 85184 2405
# Using CSS selectors to scrape the directors section
# Select the first link in the paragraph that follows a .text-muted element
# (the printed nodes are the first-listed director of each movie)
directors_data_html <- webpage %>% html_nodes('.text-muted+ p a:nth-child(1)')
directors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm1002424/?ref_=adv_li_dr_0">Michael Dougherty</a>
## [2] <a href="/name/nm1363250/?ref_=adv_li_dr_0">Gabriela Cowperthwaite</a>
## [3] <a href="/name/nm0751577/?ref_=adv_li_dr_0">Anthony Russo</a>
## [4] <a href="/name/nm1349818/?ref_=adv_li_dr_0">Anna Boden</a>
## [5] <a href="/name/nm0005363/?ref_=adv_li_dr_0">Guy Ritchie</a>
## [6] <a href="/name/nm0094435/?ref_=adv_li_dr_0">Bong Joon Ho</a>
## [7] <a href="/name/nm2942187/?ref_=adv_li_dr_0">Darius Marder</a>
## [8] <a href="/name/nm0000233/?ref_=adv_li_dr_0">Quentin Tarantino</a>
## [9] <a href="/name/nm1918140/?ref_=adv_li_dr_0">Justin Benson</a>
## [10] <a href="/name/nm0426059/?ref_=adv_li_dr_0">Rian Johnson</a>
## [11] <a href="/name/nm0005222/?ref_=adv_li_dr_0">Sam Mendes</a>
## [12] <a href="/name/nm0005366/?ref_=adv_li_dr_0">Jay Roach</a>
## [13] <a href="/name/nm0680846/?ref_=adv_li_dr_0">Todd Phillips</a>
## [14] <a href="/name/nm0009190/?ref_=adv_li_dr_0">J.J. Abrams</a>
## [15] <a href="/name/nm4170048/?ref_=adv_li_dr_0">Ari Aster</a>
## [16] <a href="/name/nm0169806/?ref_=adv_li_dr_0">Taika Waititi</a>
## [17] <a href="/name/nm1950086/?ref_=adv_li_dr_0">Greta Gerwig</a>
## [18] <a href="/name/nm0003506/?ref_=adv_li_dr_0">James Mangold</a>
## [19] <a href="/name/nm1676649/?ref_=adv_li_dr_0">Nate Parker</a>
## [20] <a href="/name/nm0000217/?ref_=adv_li_dr_0">Martin Scorsese</a>
## ...
# Keep only the link text, i.e. the director names
directors_data <- directors_data_html %>% html_text()
# Show all director names
print(directors_data)
## [1] "Michael Dougherty" "Gabriela Cowperthwaite" "Anthony Russo"
## [4] "Anna Boden" "Guy Ritchie" "Bong Joon Ho"
## [7] "Darius Marder" "Quentin Tarantino" "Justin Benson"
## [10] "Rian Johnson" "Sam Mendes" "Jay Roach"
## [13] "Todd Phillips" "J.J. Abrams" "Ari Aster"
## [16] "Taika Waititi" "Greta Gerwig" "James Mangold"
## [19] "Nate Parker" "Martin Scorsese" "Steven Knight"
## [22] "Robert Eggers" "Jenny Gage" "Mike Flanagan"
## [25] "Dexter Fletcher" "Benny Safdie" "Jake Kasdan"
## [28] "Tyler Nilson" "Jon Watts" "Clint Eastwood"
## [31] "Jon Favreau" "Todd Haynes" "Tim Miller"
## [34] "M. Night Shyamalan" "Jacob Estes" "David Michôd"
## [37] "Rod Lurie" "Galder Gaztelu-Urrutia" "Michael Bay"
## [40] "Robert Rodriguez" "Tom Hooper" "Olivia Wilde"
## [43] "Guy Ritchie" "David Leitch" "Jordan Peele"
## [46] "Céline Sciamma" "Chad Stahelski" "Adam Robitel"
## [49] "Josh Cooley" "Adam Randall" "David F. Sandberg"
## [52] "Roland Emmerich" "Benedict Andrews" "Farhad Safinia"
## [55] "Lorene Scafaria" "James Gray" "Andy Muschietti"
## [58] "Edward Norton" "Matt Bettinelli-Olpin" "Michael Engler"
## [61] "Simon Kinberg" "Elizabeth Banks" "Noah Baumbach"
## [64] "Tate Taylor" "Kitty Green" "Rose Glass"
## [67] "Joe Berlinger" "Ang Lee" "Andrea Di Stefano"
## [70] "Shannon Murphy" "Severin Fiala" "Gavin Hood"
## [73] "Chris Buck" "Vince Gilligan" "Andrew Patterson"
## [76] "Bill Condon" "Destin Daniel Cretton" "Kelly Reichardt"
## [79] "Jonas Åkerlund" "Danny Boyle" "Lorcan Finnegan"
## [82] "Richard Stanley" "Marielle Heller" "John Lee Hancock"
## [85] "Jim Jarmusch" "Simon West" "Ric Roman Waugh"
## [88] "Ruben Fleischer" "Brian Kirk" "Matteo Garrone"
## [91] "J.C. Chandor" "Justin Baldoni" "Armando Iannucci"
## [94] "Alexandre Aja" "Cory Finley" "Alma Har'el"
## [97] "Luc Besson" "Stephen Merchant" "Joachim Rønning"
## [100] "Miles Joris-Peyrafitte"
# Using CSS selectors to scrape the actors section
# Select the link right after the .ghost separator in each list item
# (the printed nodes are the first-listed star of each movie)
actors_data_html <- webpage %>% html_nodes('.lister-item-content .ghost+ a')
actors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm0151419/?ref_=adv_li_st_0">Kyle Chandler</a>
## [2] <a href="/name/nm0781981/?ref_=adv_li_st_0">Jason Segel</a>
## [3] <a href="/name/nm0000375/?ref_=adv_li_st_0">Robert Downey Jr.</a>
## [4] <a href="/name/nm0488953/?ref_=adv_li_st_0">Brie Larson</a>
## [5] <a href="/name/nm0000190/?ref_=adv_li_st_0">Matthew McConaughey</a>
## [6] <a href="/name/nm0814280/?ref_=adv_li_st_0">Kang-ho Song</a>
## [7] <a href="/name/nm1981893/?ref_=adv_li_st_0">Riz Ahmed</a>
## [8] <a href="/name/nm0000138/?ref_=adv_li_st_0">Leonardo DiCaprio</a>
## [9] <a href="/name/nm1107001/?ref_=adv_li_st_0">Anthony Mackie</a>
## [10] <a href="/name/nm0185819/?ref_=adv_li_st_0">Daniel Craig</a>
## [11] <a href="/name/nm2835616/?ref_=adv_li_st_0">Dean-Charles Chapman</a>
## [12] <a href="/name/nm0000234/?ref_=adv_li_st_0">Charlize Theron</a>
## [13] <a href="/name/nm0001618/?ref_=adv_li_st_0">Joaquin Phoenix</a>
## [14] <a href="/name/nm5397459/?ref_=adv_li_st_0">Daisy Ridley</a>
## [15] <a href="/name/nm6073955/?ref_=adv_li_st_0">Florence Pugh</a>
## [16] <a href="/name/nm9877392/?ref_=adv_li_st_0">Roman Griffin Davis</a>
## [17] <a href="/name/nm1519680/?ref_=adv_li_st_0">Saoirse Ronan</a>
## [18] <a href="/name/nm0000354/?ref_=adv_li_st_0">Matt Damon</a>
## [19] <a href="/name/nm1165044/?ref_=adv_li_st_0">Omari Hardwick</a>
## [20] <a href="/name/nm0000134/?ref_=adv_li_st_0">Robert De Niro</a>
## ...
# Keep only the link text, i.e. the actor names
actors_data <- actors_data_html %>% html_text()
# Preview the first six actors
actors_data %>% head()
## [1] "Kyle Chandler" "Jason Segel" "Robert Downey Jr."
## [4] "Brie Larson" "Matthew McConaughey" "Kang-ho Song"
Be careful with missing data.
# Select every metascore badge on the page
metascore_data_html <- webpage %>% html_nodes('.metascore')
# Extract the metascore text (each value carries trailing whitespace)
metascore_data <- metascore_data_html %>% html_text()
# Preview the first six entries
metascore_data %>% head()
## [1] "48 " "57 " "78 " "64 " "51 "
## [6] "96 "
# Trim the trailing whitespace and parse the metascores as numbers
metascore_data <- metascore_data %>%
  str_replace("\\s*$", "") %>%
  as.numeric()
print(metascore_data)
## [1] 48 57 78 64 51 96 81 83 63 82 78 64 59 53 72 58 91 81 24 94 37 83 30 59 69
## [26] 91 58 70 69 68 55 73 54 43 49 62 71 73 41 53 32 84 53 60 81 95 73 48 84 65
## [51] 71 47 54 27 79 80 58 60 64 64 43 52 94 53 79 84 52 38 61 77 64 63 64 72 84
## [76] 55 68 89 19 55 64 70 80 58 53 47 45 55 51 64 61 53 77 60 79 73 40 68 43 57
# Check how many metascores were scraped. Fewer than the number of movies
# would mean some movies have no metascore and the vector needs NA padding
# before it can be combined with the other per-movie vectors.
length(metascore_data)
## [1] 100
# Disabled manual patch (presumably from an earlier scrape; kept for
# reference): it pads hard-coded positions with NA. The length printed above
# is 100, so no padding is needed for the scrape shown here.
# # Visual inspection finds 24, 85, 100 don't have metascore
# ms <- rep(NA, 100)
# ms[-c(24, 85, 100)] <- metascore_data
# (metascore_data <- ms)
Be careful with missing data.
# Select the gross revenue spans
gross_data_html <- webpage %>% html_nodes('.ghost~ .text-muted+ span')
# Extract the gross text, e.g. "$110.50M"
gross_data <- gross_data_html %>% html_text()
# Preview the first six entries
gross_data %>% head()
## [1] "$110.50M" "$858.37M" "$426.83M" "$53.37M" "$142.50M" "$165.36M"
# Data-Preprocessing: remove the '$' and 'M' signs by pattern rather than by
# character position. The previous str_sub(gross_data, 2, 10) dropped
# whatever character came first and truncated long values; the pattern below
# matches the cleanup used when rank and gross are scraped together.
gross_data <- str_replace_all(gross_data, "[$M]", "")
# Parse the remaining text (e.g. "110.50") as a number, in millions of dollars
gross_data <- as.numeric(gross_data)
# Let's check the length of gross data
length(gross_data)
## [1] 56
# Visual inspection finds below movies don't have gross
#gs_data <- rep(NA, 100)
#gs_data[-c(1, 2, 3, 5, 61, 69, 71, 74, 78, 82, 84:87, 90)] <- gross_data
#(gross_data <- gs_data)
44 (out of 100) movies don’t have gross data yet! We need a better way to figure out missing entries.
# Retrieve the rank labels and the gross spans with one combined selector,
# then drop the first whitespace run and any '$' / 'M' characters from
# each string
rank_and_gross <- str_replace_all(
  str_replace(
    html_text(html_nodes(webpage, '.ghost~ .text-muted+ span , .text-primary')),
    "\\s+", ""
  ),
  "[$M]", ""
)
rank_and_gross
## [1] "1." "110.50" "2." "3." "858.37" "4." "426.83" "5."
## [9] "6." "53.37" "7." "8." "142.50" "9." "10." "165.36"
## [17] "11." "159.23" "12." "13." "335.45" "14." "515.20" "15."
## [25] "27.33" "16." "0.35" "17." "108.10" "18." "117.62" "19."
## [33] "20." "7.00" "21." "8.55" "22." "0.43" "23." "12.14"
## [41] "24." "25." "96.37" "26." "27." "316.83" "28." "13.12"
## [49] "29." "390.53" "30." "31." "543.64" "32." "33." "62.25"
## [57] "34." "111.05" "35." "4.69" "36." "37." "38." "39."
## [65] "40." "85.71" "41." "42." "22.68" "43." "355.56" "44."
## [73] "173.96" "45." "175.08" "46." "3.76" "47." "171.02" "48."
## [81] "57.01" "49." "434.04" "50." "51." "140.37" "52." "53."
## [89] "54." "55." "80.55" "56." "35.40" "57." "211.59" "58."
## [97] "59." "26.74" "60." "96.85" "61." "65.85" "62." "63."
## [105] "2.00" "64." "45.37" "65." "66." "67." "68." "20.55"
## [113] "69." "70." "71." "72." "0.40" "73." "477.37" "74."
## [121] "75." "76." "77." "78." "79." "80." "73.29" "81."
## [129] "82." "83." "61.70" "84." "85." "6.56" "86." "87."
## [137] "67.16" "88." "26.80" "89." "90." "91." "92." "45.73"
## [145] "93." "94." "39.01" "95." "96." "97." "7.74" "98."
## [153] "22.96" "99." "113.93" "100."
# Find the movies that have no gross figure and pad gross_data with NA at
# those ranks so it lines up with the other per-movie vectors.
# rank_and_gross interleaves rank labels ("12.") with gross values
# ("335.45"); a rank label directly followed by another rank label, or
# sitting at the very end, belongs to a movie without a gross entry.
isrank <- str_detect(rank_and_gross, "\\.$")
# TRUE where element i and element i + 1 are both rank labels
ismissing <- isrank[-length(isrank)] & isrank[-1]
# The final element has no successor: it is missing exactly when it is a rank
ismissing <- c(ismissing, isrank[length(isrank)])
# Ranks are used as positions, which assumes the page starts at rank 1
missingpos <- as.integer(rank_and_gross[ismissing])
# One slot per scraped movie instead of a hard-coded 100
n_movies <- sum(isrank)
gs_data <- rep(NA_real_, n_movies)
# Use positive indexing: gs_data[-missingpos] selects nothing when
# missingpos is empty, which would leave every gross value NA
present_pos <- setdiff(seq_len(n_movies), missingpos)
# Guard against silent recycling if the two scrapes disagree
stopifnot(length(present_pos) == length(gross_data))
gs_data[present_pos] <- gross_data
(gross_data <- gs_data)
## [1] 110.50 NA 858.37 426.83 NA 53.37 NA 142.50 NA 165.36
## [11] 159.23 NA 335.45 515.20 27.33 0.35 108.10 117.62 NA 7.00
## [21] 8.55 0.43 12.14 NA 96.37 NA 316.83 13.12 390.53 NA
## [31] 543.64 NA 62.25 111.05 4.69 NA NA NA NA 85.71
## [41] NA 22.68 355.56 173.96 175.08 3.76 171.02 57.01 434.04 NA
## [51] 140.37 NA NA NA 80.55 35.40 211.59 NA 26.74 96.85
## [61] 65.85 NA 2.00 45.37 NA NA NA 20.55 NA NA
## [71] NA 0.40 477.37 NA NA NA NA NA NA 73.29
## [81] NA NA 61.70 NA 6.56 NA 67.16 26.80 NA NA
## [91] NA 45.73 NA 39.01 NA NA 7.74 22.96 113.93 NA
The following code programmatically figures out the missing entries for metascore.
# Select the rank labels together with the metascore badges (unfavorable,
# favorable, or mixed) using one combined selector
rank_metascore_data_html <- webpage %>%
  html_nodes('.unfavorable , .favorable , .mixed , .text-primary')
rank_metascore_data_html
## {xml_nodeset (200)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="metascore mixed">48 </span>
## [3] <span class="lister-item-index unbold text-primary">2.</span>
## [4] <span class="metascore mixed">57 </span>
## [5] <span class="lister-item-index unbold text-primary">3.</span>
## [6] <span class="metascore favorable">78 </span>
## [7] <span class="lister-item-index unbold text-primary">4.</span>
## [8] <span class="metascore favorable">64 </span>
## [9] <span class="lister-item-index unbold text-primary">5.</span>
## [10] <span class="metascore mixed">51 </span>
## [11] <span class="lister-item-index unbold text-primary">6.</span>
## [12] <span class="metascore favorable">96 </span>
## [13] <span class="lister-item-index unbold text-primary">7.</span>
## [14] <span class="metascore favorable">81 </span>
## [15] <span class="lister-item-index unbold text-primary">8.</span>
## [16] <span class="metascore favorable">83 </span>
## [17] <span class="lister-item-index unbold text-primary">9.</span>
## [18] <span class="metascore favorable">63 </span>
## [19] <span class="lister-item-index unbold text-primary">10.</span>
## [20] <span class="metascore favorable">82 </span>
## ...
# Extract the text of the interleaved rank and metascore nodes
rank_metascore_data <- rank_metascore_data_html %>% html_text()
rank_metascore_data
## [1] "1." "48 " "2." "57 " "3."
## [6] "78 " "4." "64 " "5." "51 "
## [11] "6." "96 " "7." "81 " "8."
## [16] "83 " "9." "63 " "10." "82 "
## [21] "11." "78 " "12." "64 " "13."
## [26] "59 " "14." "53 " "15." "72 "
## [31] "16." "58 " "17." "91 " "18."
## [36] "81 " "19." "24 " "20." "94 "
## [41] "21." "37 " "22." "83 " "23."
## [46] "30 " "24." "59 " "25." "69 "
## [51] "26." "91 " "27." "58 " "28."
## [56] "70 " "29." "69 " "30." "68 "
## [61] "31." "55 " "32." "73 " "33."
## [66] "54 " "34." "43 " "35." "49 "
## [71] "36." "62 " "37." "71 " "38."
## [76] "73 " "39." "41 " "40." "53 "
## [81] "41." "32 " "42." "84 " "43."
## [86] "53 " "44." "60 " "45." "81 "
## [91] "46." "95 " "47." "73 " "48."
## [96] "48 " "49." "84 " "50." "65 "
## [101] "51." "71 " "52." "47 " "53."
## [106] "54 " "54." "27 " "55." "79 "
## [111] "56." "80 " "57." "58 " "58."
## [116] "60 " "59." "64 " "60." "64 "
## [121] "61." "43 " "62." "52 " "63."
## [126] "94 " "64." "53 " "65." "79 "
## [131] "66." "84 " "67." "52 " "68."
## [136] "38 " "69." "61 " "70." "77 "
## [141] "71." "64 " "72." "63 " "73."
## [146] "64 " "74." "72 " "75." "84 "
## [151] "76." "55 " "77." "68 " "78."
## [156] "89 " "79." "19 " "80." "55 "
## [161] "81." "64 " "82." "70 " "83."
## [166] "80 " "84." "58 " "85." "53 "
## [171] "86." "47 " "87." "45 " "88."
## [176] "55 " "89." "51 " "90." "64 "
## [181] "91." "61 " "92." "53 " "93."
## [186] "77 " "94." "60 " "95." "79 "
## [191] "96." "73 " "97." "40 " "98."
## [196] "68 " "99." "43 " "100." "57 "
# Strip every run of whitespace; str_replace() would remove only the first
# run and leave any later whitespace in place
(rank_metascore_data <- str_replace_all(rank_metascore_data, "\\s+", ""))
## [1] "1." "48" "2." "57" "3." "78" "4." "64" "5." "51"
## [11] "6." "96" "7." "81" "8." "83" "9." "63" "10." "82"
## [21] "11." "78" "12." "64" "13." "59" "14." "53" "15." "72"
## [31] "16." "58" "17." "91" "18." "81" "19." "24" "20." "94"
## [41] "21." "37" "22." "83" "23." "30" "24." "59" "25." "69"
## [51] "26." "91" "27." "58" "28." "70" "29." "69" "30." "68"
## [61] "31." "55" "32." "73" "33." "54" "34." "43" "35." "49"
## [71] "36." "62" "37." "71" "38." "73" "39." "41" "40." "53"
## [81] "41." "32" "42." "84" "43." "53" "44." "60" "45." "81"
## [91] "46." "95" "47." "73" "48." "48" "49." "84" "50." "65"
## [101] "51." "71" "52." "47" "53." "54" "54." "27" "55." "79"
## [111] "56." "80" "57." "58" "58." "60" "59." "64" "60." "64"
## [121] "61." "43" "62." "52" "63." "94" "64." "53" "65." "79"
## [131] "66." "84" "67." "52" "68." "38" "69." "61" "70." "77"
## [141] "71." "64" "72." "63" "73." "64" "74." "72" "75." "84"
## [151] "76." "55" "77." "68" "78." "89" "79." "19" "80." "55"
## [161] "81." "64" "82." "70" "83." "80" "84." "58" "85." "53"
## [171] "86." "47" "87." "45" "88." "55" "89." "51" "90." "64"
## [181] "91." "61" "92." "53" "93." "77" "94." "60" "95." "79"
## [191] "96." "73" "97." "40" "98." "68" "99." "43" "100." "57"
# Flag the entries that are ranks: a rank is written with a trailing period.
# Two ranks in a row mean the metascore belonging to the first one is missing.
isrank <- str_detect(rank_metascore_data, "\\.$")
isrank
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [13] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [25] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [37] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [49] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [61] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [73] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [85] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [97] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [109] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [121] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [133] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [145] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [157] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [169] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [181] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [193] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
# A metascore is missing for a rank when the entry right after it is another
# rank, or when the rank is the very last entry (nothing follows it at all).
# Shift `isrank` left by one and pad with TRUE so the last entry is handled by
# the same rule. This avoids `1:length(x)-1`, which parses as `(1:n) - 1` and
# only worked because index 0 is silently dropped, and `2:length(x)`, which
# counts backwards when there are fewer than two entries.
next_is_rank <- c(isrank[-1], TRUE)
ismissing <- isrank & next_is_rank
# Ranks look like "12."; as.integer() parses them to the rank number
(missingpos <- as.integer(rank_metascore_data[ismissing]))
## integer(0)
#(rank_metascore_data <- as.integer(rank_metascore_data))
You (students) should work out the code for finding missing positions for gross.
Form a tibble:
# Assemble the scraped vectors into a single tibble, one column per field
movies <- tibble(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# width = Inf prints every column instead of truncating to the console width
print(movies, width = Inf)
## # A tibble: 100 x 11
## Rank Title
## <int> <chr>
## 1 1 Godzilla: King of the Monsters
## 2 2 Our Friend
## 3 3 Avengers: Endgame
## 4 4 Captain Marvel
## 5 5 The Gentlemen
## 6 6 Parasite
## 7 7 Sound of Metal
## 8 8 Once Upon a Time... In Hollywood
## 9 9 Synchronic
## 10 10 Knives Out
## Description
## <chr>
## 1 The crypto-zoological agency Monarch faces off against a battery of god-size…
## 2 After receiving life-altering news, a couple finds unexpected support from t…
## 3 After the devastating events of Avengers: Infinity War (2018), the universe …
## 4 Carol Danvers becomes one of the universe's most powerful heroes when Earth …
## 5 An American expat tries to sell off his highly profitable marijuana empire i…
## 6 Greed and class discrimination threaten the newly formed symbiotic relations…
## 7 A heavy-metal drummer's life is thrown into freefall when he begins to lose …
## 8 A faded television actor and his stunt double strive to achieve fame and suc…
## 9 Two New Orleans paramedics' lives are ripped apart after they encounter a se…
## 10 A detective investigates the death of a patriarch of an eccentric, combative…
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 132 Action 6 48 134259 110.
## 2 124 Drama 7.2 57 1437 NA
## 3 181 Action 8.4 78 810893 858.
## 4 123 Action 6.9 64 442500 427.
## 5 113 Action 7.8 51 237935 NA
## 6 132 Comedy 8.6 96 553703 53.4
## 7 120 Drama 7.8 81 27602 NA
## 8 161 Comedy 7.6 83 551932 142.
## 9 102 Drama 6.2 63 7253 NA
## 10 130 Comedy 7.9 82 454854 165.
## Director Actor
## <chr> <chr>
## 1 Michael Dougherty Kyle Chandler
## 2 Gabriela Cowperthwaite Jason Segel
## 3 Anthony Russo Robert Downey Jr.
## 4 Anna Boden Brie Larson
## 5 Guy Ritchie Matthew McConaughey
## 6 Bong Joon Ho Kang-ho Song
## 7 Darius Marder Riz Ahmed
## 8 Quentin Tarantino Leonardo DiCaprio
## 9 Justin Benson Anthony Mackie
## 10 Rian Johnson Daniel Craig
## # … with 90 more rows
How many top 100 movies are in each genre? (Be careful with interpretation.)
# Bar chart of the number of movies per genre
ggplot(data = movies, mapping = aes(x = Genre)) +
  geom_bar()
Which genre is most profitable in terms of average gross earnings?
# Mean gross earning per genre (movies with a missing gross are ignored in the
# mean), drawn as one column per genre
movies %>%
  group_by(Genre) %>%
  summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = Genre, y = avg_earning)) +
  geom_col() +
  labs(y = "avg earning in millions")
# Distribution of gross earning within each genre; rows with a missing gross
# are dropped by ggplot2 with a warning
movies %>%
  ggplot(mapping = aes(x = Genre, y = Gross_Earning_in_Mil)) +
  geom_boxplot() +
  labs(y = "Gross earning in millions")
## Warning: Removed 44 rows containing non-finite values (stat_boxplot).
Is there a relationship between gross earnings and rating? Find the best-selling movie (by gross earnings) in each genre.
library("ggrepel")
# Keep, within each genre, the single movie with the largest gross earning.
# The result stays grouped by Genre.
best_in_genre <- movies %>%
  group_by(Genre) %>%
  filter(row_number(desc(Gross_Earning_in_Mil)) == 1)
print(best_in_genre, width = Inf)
## # A tibble: 8 x 11
## # Groups: Genre [8]
## Rank Title
## <int> <chr>
## 1 3 Avengers: Endgame
## 2 10 Knives Out
## 3 13 Joker
## 4 25 Rocketman
## 5 31 The Lion King
## 6 43 Aladdin
## 7 45 Us
## 8 57 It Chapter Two
## Description
## <chr>
## 1 After the devastating events of Avengers: Infinity War (2018), the universe i…
## 2 A detective investigates the death of a patriarch of an eccentric, combative …
## 3 In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mi…
## 4 A musical fantasy about the fantastical human story of Elton John's breakthro…
## 5 After the murder of his father, a young lion prince flees his kingdom only to…
## 6 A kind-hearted street urchin and a power-hungry Grand Vizier vie for a magic …
## 7 A family's serene beach vacation turns to chaos when their doppelgängers appe…
## 8 Twenty-seven years after their first encounter with the terrifying Pennywise,…
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil Director
## <int> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 181 Action 8.4 78 810893 858. Anthony Russo
## 2 130 Comedy 7.9 82 454854 165. Rian Johnson
## 3 122 Crime 8.5 59 940168 335. Todd Phillips
## 4 121 Biography 7.3 69 137492 96.4 Dexter Fletcher
## 5 118 Animation 6.9 55 212343 544. Jon Favreau
## 6 128 Adventure 6.9 53 227804 356. Guy Ritchie
## 7 116 Horror 6.8 81 227434 175. Jordan Peele
## 8 169 Drama 6.5 58 207878 212. Andy Muschietti
## Actor
## <chr>
## 1 Robert Downey Jr.
## 2 Daniel Craig
## 3 Joaquin Phoenix
## 4 Taron Egerton
## 5 Donald Glover
## 6 Will Smith
## 7 Lupita Nyong'o
## 8 Jessica Chastain
# Rating vs. gross earning: point size encodes votes, colour encodes genre,
# and the top earner of each genre is labelled with its title
movies %>%
  ggplot(mapping = aes(x = Rating, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Votes, color = Genre)) +
  ggrepel::geom_label_repel(
    data = best_in_genre,
    mapping = aes(label = Title)
  ) +
  labs(y = "Gross earning in millions")
## Warning: Removed 44 rows containing missing values (geom_point).
quantmod package contains many utility functions for retrieving and plotting finance data. E.g.,
library("quantmod")
# Fetch GOOG price data from Yahoo starting 2020-01-01; with
# auto.assign = FALSE the series is returned and stored in `stock`
stock <- getSymbols(
  "GOOG",
  src = "yahoo",
  from = "2020-01-01",
  auto.assign = FALSE
)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
# Peek at the first six rows of the price series
head(stock, n = 6)
## GOOG.Open GOOG.High GOOG.Low GOOG.Close GOOG.Volume GOOG.Adjusted
## 2020-01-02 1341.55 1368.14 1341.550 1367.37 1406600 1367.37
## 2020-01-03 1347.86 1372.50 1345.544 1360.66 1186400 1360.66
## 2020-01-06 1350.00 1396.50 1350.000 1394.21 1732300 1394.21
## 2020-01-07 1397.94 1402.99 1390.380 1393.34 1502700 1393.34
## 2020-01-08 1392.08 1411.58 1390.840 1404.32 1528000 1404.32
## 2020-01-09 1420.57 1427.33 1410.270 1419.83 1500900 1419.83
# Plot the series as a line chart on the white theme, with a linear (non-log)
# price axis and TA = NULL
chartSeries(
  stock,
  type = "line",
  theme = chartTheme("white"),
  log.scale = FALSE,
  TA = NULL
)
Read blog: https://towardsdatascience.com/pulling-tweets-into-r-e17d4981cfe2
The twitteR package is useful for pulling tweet text data into R.
# Load the twitteR package
library("twitteR")
Step 1: apply for a Twitter developer account. It takes some time to get approved.
Step 2: Generate and copy the Twitter App Keys.
# Placeholders: paste the keys and tokens generated for your Twitter app here
consumer_key <- "XXXXXXXXXX"
consumer_secret <- "XXXXXXXXXX"
access_token <- "XXXXXXXXXX"
access_secret <- "XXXXXXXXXX"
# Authenticate this R session against the Twitter API
setup_twitter_oauth(
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret
)
# Search for up to 1000 tweets matching the query, dated from 2020-01-01 on
virus <- searchTwitter(
  searchString = "#China + #Coronavirus",
  n = 1000,
  since = "2020-01-01",
  retryOnRateLimit = 1e3
)
# Flatten the list of status objects into a data frame, then into a tibble
virus_df <- virus %>%
  twListToDF() %>%
  as_tibble()
print(virus_df, width = Inf)